In [2]:
import numpy as np
import numpy.random as npr
import pylab as pl
%matplotlib inline

In [3]:
# for reproducibility
npr.seed(2)

In [4]:
# generate some synthetic data
num_centers = 10
dim = 2
centers = np.zeros((num_centers,dim))
sizes = (npr.rand(num_centers)+0.5)  # per-cluster standard deviations, in [0.5, 1.5)

num_points = 10000

# random walk: each center is a Gaussian step away from the previous one
for i in range(1,num_centers):
    centers[i] = centers[i-1] + npr.randn(dim)*3

    
X = np.zeros((num_points,dim))
Y = np.zeros(num_points)
# draw each point from a randomly chosen center, with that center's spread
for i in range(num_points):
    c = npr.randint(num_centers)
    X[i] = npr.randn(dim)*sizes[c] + centers[c]
    Y[i] = c

In [5]:
pl.scatter(X[:,0],X[:,1],c=Y,linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1],c=range(num_centers))
for i in range(num_centers-1):
    cpts = centers[i:i+2]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[5]:
(-8.0, 8.0, -6.0, 10.0)

In [6]:
# to do: replace random walk with a branching process

In [7]:
from sklearn.neighbors import KernelDensity

In [8]:
bandwidth = 0.5
kde = KernelDensity(bandwidth=bandwidth)

In [9]:
kde.fit(X)
scores = kde.score_samples(X)

In [10]:
scores.min(),scores.max()


Out[10]:
(-8.6331214511766312, -3.3486856194786867)
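
score_samples returns the log of the estimated density, so the values above are log-densities. Exponentiating recovers densities on the original scale (a quick sketch; the exact values depend on the run above):

In [ ]:
# log-densities -> densities
densities = np.exp(scores)
densities.min(), densities.max()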

In [11]:
# logistic sigmoid; on inputs in [0, 1] it only spans [0.5, ~0.73], which is
# why the smoothed acceptance probabilities below end up unused
sigmoid = lambda x : (1 / (1 + np.exp(-x)))

In [12]:
# invert the normalized log-density: dense regions get a low acceptance probability
accept_prob = 1 - (scores - scores.min()) / (scores.max() - scores.min())
accept_prob_s = sigmoid(accept_prob)
print(accept_prob.min(),accept_prob.max())
print(accept_prob_s.min(),accept_prob_s.max())


(0.0, 1.0)
(0.5, 0.7310585786300049)
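
This linear rescaling of the log-density is one choice among several. For comparison, here is a hedged sketch of the thresholded scheme described in the SPADE paper (as I recall it: drop presumed outliers below an outlier density OD, keep everything below a target density TD, and keep denser points with probability TD / density; the percentile cutoffs below are illustrative assumptions, not the paper's values):

In [ ]:
densities = np.exp(scores)
OD = np.percentile(densities, 1)   # assumed outlier cutoff
TD = np.percentile(densities, 5)   # assumed target density
keep_prob = np.where(densities < OD, 0.0,
                     np.where(densities < TD, 1.0, TD / densities))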

In [13]:
pl.scatter(X[:,0],X[:,1],c=(np.log(accept_prob+1)+0.3),linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')


Out[13]:
(-8.0, 8.0, -6.0, 10.0)

In [14]:
# density-dependent down-sampling: keep point i with probability accept_prob[i]
down_sampled_X = []
down_sampled_Y = []
down_sampled_i = []
for i in range(num_points):
    if npr.rand() < accept_prob[i]:
        down_sampled_i.append(i)
        down_sampled_X.append(X[i])
        down_sampled_Y.append(Y[i])
down_sampled_X = np.array(down_sampled_X)
down_sampled_Y = np.array(down_sampled_Y)
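
The accept/reject loop above can also be written as a single vectorized mask (hypothetical names; same result up to randomness):

In [ ]:
# vectorized equivalent of the accept/reject loop
mask = npr.rand(num_points) < accept_prob
down_sampled_i_v = np.where(mask)[0]
down_sampled_X_v = X[mask]
down_sampled_Y_v = Y[mask]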

In [15]:
len(X)


Out[15]:
10000

In [16]:
len(down_sampled_X)


Out[16]:
1718

In [17]:
kde_2 = KernelDensity(bandwidth=bandwidth)
kde_2.fit(down_sampled_X)
new_densities = kde_2.score_samples(down_sampled_X)

In [18]:
new_densities.shape


Out[18]:
(1718,)

In [19]:
# color by negated log-density (the constant offset is irrelevant after colormap normalization)
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=-new_densities.max() - new_densities,linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')


Out[19]:
(-8.0, 8.0, -6.0, 10.0)

In [20]:
from sklearn.cluster import AgglomerativeClustering

In [21]:
cluster_model = AgglomerativeClustering(n_clusters=10)
cluster_model.fit(down_sampled_X)


Out[21]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward',
            memory=Memory(cachedir=None), n_clusters=10, n_components=None,
            pooling_func=<function mean at 0x106d98410>)
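
Ward agglomeration gets expensive as the number of points grows. A common speed-up is to restrict merges to a k-nearest-neighbor connectivity graph; a sketch (the 10-neighbor setting is an arbitrary assumption):

In [ ]:
from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(down_sampled_X, n_neighbors=10, include_self=False)
cluster_model_knn = AgglomerativeClustering(n_clusters=10, connectivity=connectivity)
cluster_model_knn.fit(down_sampled_X)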

In [22]:
cluster_pred = cluster_model.labels_

In [23]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=Y[down_sampled_i],linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')


Out[23]:
(-8.0, 8.0, -6.0, 10.0)

In [24]:
cluster_centers = []
for i in range(len(set(cluster_pred))):
    cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
cluster_centers = np.array(cluster_centers)

In [25]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[25]:
(-8.0, 8.0, -6.0, 10.0)

In [26]:
cluster_true = Y[down_sampled_i]

In [27]:
from sklearn import metrics

In [28]:
# for just the downsampled points
metrics.adjusted_mutual_info_score(cluster_true,cluster_pred)


Out[28]:
0.6368748721948998

In [29]:
def distance(x,y):
    # Euclidean distance between two vectors
    return np.sqrt(np.sum((x-y)**2))

In [30]:
def assign_to_nearest_landmark(points,landmarks):
    # brute force: O(len(points) * len(landmarks)) distance evaluations
    nearest = np.zeros(len(points),dtype=int)
    for i,point in enumerate(points):
        distances = np.zeros(len(landmarks))
        for j,landmark in enumerate(landmarks):
            distances[j] = distance(point,landmark)
        nearest[i] = np.argmin(distances)
    return nearest
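
The double loop is easy to read but slow; a vectorized equivalent builds the full point-landmark squared-distance matrix with broadcasting:

In [ ]:
def assign_to_nearest_landmark_fast(points,landmarks):
    # (num_points, num_landmarks) squared distances, then argmin per row
    d2 = ((points[:,None,:] - landmarks[None,:,:])**2).sum(axis=2)
    return np.argmin(d2,axis=1)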

In [31]:
upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)

In [32]:
pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[32]:
(-8.0, 8.0, -6.0, 10.0)

In [33]:
# for all points
metrics.adjusted_mutual_info_score(Y,upsampled_cluster_pred)


Out[33]:
0.61736845718580546

In [34]:
from sklearn.cluster import KMeans

In [35]:
kmeans_model = KMeans(n_clusters=10)

In [36]:
kmeans_model.fit(X)


Out[36]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [37]:
kmeans_pred = kmeans_model.predict(X)
metrics.adjusted_mutual_info_score(Y,kmeans_pred)


Out[37]:
0.67519307403453466
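
The adjusted Rand index is another chance-corrected agreement score and makes a quick cross-check of the two pipelines:

In [ ]:
metrics.adjusted_rand_score(Y,kmeans_pred), metrics.adjusted_rand_score(Y,upsampled_cluster_pred)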

In [38]:
# compute distance graph

import networkx as nx

def distance_graph(centers):
    G = nx.Graph()
    for i in range(len(centers)-1):
        for j in range(i+1,len(centers)):
            G.add_edge(i,j,weight=distance(centers[i],centers[j]))
    return G

G = distance_graph(centers)
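
The nested loop makes O(K^2) distance calls, which is fine for a handful of landmarks. The same complete graph can be built from a precomputed distance matrix (a sketch, assuming scipy is available):

In [ ]:
from scipy.spatial.distance import pdist, squareform
D = squareform(pdist(centers))  # dense pairwise Euclidean distances
G_alt = nx.Graph()
for i in range(len(centers)-1):
    for j in range(i+1,len(centers)):
        G_alt.add_edge(i,j,weight=D[i,j])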

In [39]:
mst = nx.minimum_spanning_tree(G)

In [40]:
mst.edges()


Out[40]:
[(0, 7), (1, 5), (2, 3), (2, 5), (2, 7), (4, 5), (5, 6), (7, 8), (8, 9)]

In [41]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=Y[down_sampled_i],linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1]);
for e in mst.edges():
    cpts = centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[41]:
(-8.0, 8.0, -6.0, 10.0)

In [42]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)

pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[42]:
(-8.0, 8.0, -6.0, 10.0)

In [43]:
# how many points were assigned to each landmark
occupancy = np.array([np.sum(upsampled_cluster_pred==i) for i in range(len(set(upsampled_cluster_pred)))])
# cast to float so integer division doesn't truncate the normalization
norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())

In [44]:
norm_occupancy.max()


Out[44]:
1.0

In [45]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1],
           c=cluster_centers[:,0],s=100+(200*norm_occupancy));

pl.axis('off')


Out[45]:
(-5.0, 5.0, -4.0, 8.0)

In [46]:
# ground-truth occupancy, for sizing the true centers
occupancy = np.array([np.sum(Y==i) for i in range(num_centers)])
norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())

In [47]:
G = distance_graph(centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1],
           c=centers[:,1],s=100+(200*norm_occupancy));
pl.axis('off')


Out[47]:
(-5.0, 4.0, -4.0, 8.0)

Re-doing the analysis with a hypothesis of 50 clusters instead, this time wrapped up as a reusable function.


In [97]:
def SPADE(X,num_clusters=50,kde_bandwidth=0.5,native_layout=False):
    num_points = len(X)
    
    from time import time
    t0 = time()
    print("Estimating density...")
    # estimate density
    kde = KernelDensity(bandwidth=kde_bandwidth)
    kde.fit(X)
    scores = kde.score_samples(X)
    
    print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
    
    # "down-sample"
    print("Down-sampling...")
    accept_prob = 1 - (scores - scores.min()) / (scores.max() - scores.min())
    
    down_sampled_X = []
    down_sampled_i = []
    for i in range(num_points):
        if npr.rand() < accept_prob[i]:
            down_sampled_i.append(i)
            down_sampled_X.append(X[i])
    down_sampled_X = np.array(down_sampled_X)
    
    print("Done! Reduced data from {1} to {2} points. Elapsed time: {0:.2f}s".format(time() - t0,len(X),len(down_sampled_X)))
    
    print("Clustering...")
    # cluster down-sampled data
    cluster_model = AgglomerativeClustering(n_clusters=num_clusters)
    cluster_model.fit(down_sampled_X)
    
    # compute cluster centers
    cluster_pred = cluster_model.labels_
    cluster_centers = []
    for i in range(num_clusters):
        cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
    cluster_centers = np.array(cluster_centers)
    
    print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
    
    print("Up-sampling...")
    
    # "up-sample" (assign all points to nearest cluster center)
    upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)
    
    # compute normalized cluster occupancies
    occupancy = np.array([np.sum(upsampled_cluster_pred==i) for i in range(num_clusters)])
    norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())
    
    print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
    
    # compute MST
    print("Computing MST...")
    G = distance_graph(cluster_centers)
    mst = nx.minimum_spanning_tree(G)
    
    print("Done! Elapsed time: {0:.2f}s".format(time() - t0))

    # plot SPADE tree
    print("Plotting SPADE tree...")
    if X.shape[1] == 2:
        print("Plotting overlay on data")
        pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
        pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
        for e in mst.edges():
            cpts = cluster_centers[np.array(e)]
            pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
        pl.axis('off')
        
        pl.figure()
    
    # set positions by force-directed layout
    # (newer networkx exposes this as nx.nx_agraph.graphviz_layout)
    pos = nx.graphviz_layout(mst)
    positions = np.zeros((len(pos),2))
    for p in pos:
        positions[p] = pos[p]
    
    # if the data is 2D, we can optionally use the cluster centers themselves as layout positions
    if X.shape[1] == 2 and native_layout:
        positions = cluster_centers
    
    for e in mst.edges():
        cpts = positions[np.array(e)]
        pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
    #pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
    pl.scatter(positions[:,0],positions[:,1],
               c=cluster_centers[:,0],s=100+(200*norm_occupancy));

    pl.axis('off')
    
    print("Done! Total elapsed time: {0:.2f}s".format(time() - t0))
    
    return cluster_centers,mst
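
Nothing in SPADE above is specific to two dimensions: for higher-dimensional input the data overlay is skipped and only the force-directed tree is drawn. A hypothetical run on random 5-D data:

In [ ]:
X5 = npr.randn(2000,5)                       # hypothetical 5-D data
centers5, mst5 = SPADE(X5, num_clusters=20)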

In [86]:
results = SPADE(X[:1000])


Estimating density...
Done! Elapsed time: 0.05s
Down-sampling...
Done! Reduced data from 1000 to 251 points. Elapsed time: 0.05s
Clustering...
Done! Elapsed time: 0.06s
Up-sampling...
Done! Elapsed time: 0.43s
Computing MST...
Done! Elapsed time: 0.44s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 0.63s

In [ ]:


In [ ]:


In [223]:
# the four corners of a centered square, replicated at 14 increasing scales
# (plus the origin): points along the two diagonals
centers = np.array([[0,0],[1,0],[0,1],[1,1]],dtype=float)
centers -= 0.5
centers = np.vstack((centers,centers*2,centers*3,
                     centers*4,centers*5,centers*6,
                     centers*7,centers*8,centers*9,
                     centers*10,centers*11,centers*12,
                     centers*13,centers*14,
                     [0,0]))

In [224]:
centers *= 3

In [225]:
kde = KernelDensity()
kde.fit(centers)


Out[225]:
KernelDensity(algorithm='auto', atol=0, bandwidth=1.0, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [226]:
samples = kde.sample(5000)

In [227]:
pl.scatter(samples[:,0],samples[:,1],linewidths=0,
           c=kde.score_samples(samples))
pl.axis('off')


Out[227]:
(-30.0, 30.0, -30.0, 30.0)

In [228]:
for i in range(10):
    pl.figure()
    results = SPADE(samples)


Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1019 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.04s
Up-sampling...
Done! Elapsed time: 2.74s
Computing MST...
Done! Elapsed time: 2.75s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 2.94s
Estimating density...
Done! Elapsed time: 0.67s
Down-sampling...
Done! Reduced data from 5000 to 1087 points. Elapsed time: 0.67s
Clustering...
Done! Elapsed time: 1.08s
Up-sampling...
Done! Elapsed time: 2.82s
Computing MST...
Done! Elapsed time: 2.83s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.02s
Estimating density...
Done! Elapsed time: 0.69s
Down-sampling...
Done! Reduced data from 5000 to 1033 points. Elapsed time: 0.69s
Clustering...
Done! Elapsed time: 1.03s
Up-sampling...
Done! Elapsed time: 2.79s
Computing MST...
Done! Elapsed time: 2.80s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.22s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1064 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.09s
Up-sampling...
Done! Elapsed time: 2.87s
Computing MST...
Done! Elapsed time: 2.89s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.08s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1087 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.12s
Up-sampling...
Done! Elapsed time: 2.93s
Computing MST...
Done! Elapsed time: 2.95s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.14s
Estimating density...
Done! Elapsed time: 0.69s
Down-sampling...
Done! Reduced data from 5000 to 1046 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.07s
Up-sampling...
Done! Elapsed time: 2.85s
Computing MST...
Done! Elapsed time: 2.86s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.04s
Estimating density...
Done! Elapsed time: 0.66s
Down-sampling...
Done! Reduced data from 5000 to 1025 points. Elapsed time: 0.67s
Clustering...
Done! Elapsed time: 1.01s
Up-sampling...
Done! Elapsed time: 2.79s
Computing MST...
Done! Elapsed time: 2.80s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 2.99s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1045 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.05s
Up-sampling...
Done! Elapsed time: 2.75s
Computing MST...
Done! Elapsed time: 2.77s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 2.95s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1027 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.05s
Up-sampling...
Done! Elapsed time: 2.85s
Computing MST...
Done! Elapsed time: 2.86s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.16s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1051 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.08s
Up-sampling...
Done! Elapsed time: 2.82s
Computing MST...
Done! Elapsed time: 2.83s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.02s

In [72]:
cluster_pred = cluster_model.labels_

In [73]:
cluster_centers = []
for i in range(len(set(cluster_pred))):
    cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
cluster_centers = np.array(cluster_centers)

In [74]:
upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)

In [75]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[75]:
(-8.0, 8.0, -6.0, 10.0)

In [76]:
pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[76]:
(-8.0, 8.0, -6.0, 10.0)

In [77]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)

pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[77]:
(-8.0, 8.0, -6.0, 10.0)

In [78]:
# occupancy of each predicted cluster, for sizing the predicted centers
occupancy = np.array([np.sum(upsampled_cluster_pred==i) for i in range(len(set(upsampled_cluster_pred)))])
norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())

In [79]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1],
           c=cluster_centers[:,1],s=100+(200*norm_occupancy));
pl.axis('off')


Out[79]:
(-6.0, 8.0, -6.0, 10.0)

In [135]:
#pos = nx.spectral_layout(mst)
pos = nx.graphviz_layout(mst)

In [136]:
positions = np.zeros((len(pos),2))
for p in pos:
    positions[p] = pos[p]

In [ ]:


In [140]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = positions[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(positions[:,0],positions[:,1],
           c=cluster_centers[:,0],
           s=100+(200*norm_occupancy));
pl.axis('off')


Out[140]:
(-1000.0, 1500.0, -600.0, 600.0)

In [104]:
pl.scatter(positions[:,0],positions[:,1])


Out[104]:
<matplotlib.collections.PathCollection at 0x1084e6150>

In [ ]: